This notebook gives an example of how to map a single protein sequence to its structure, along with conducting sequence alignments and visualizing the mutations.
In [1]:
import sys
import logging
In [2]:
# Import the Protein class
from ssbio.core.protein import Protein
In [3]:
# Printing multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Set the logging level in logger.setLevel(logging.<LEVEL_HERE>)
to specify how verbose you want the pipeline to be. Debug is most verbose.
CRITICAL
ERROR
WARNING
INFO
(default)DEBUG
In [4]:
# Create logger
logger = logging.getLogger()
logger.setLevel(logging.INFO) # SET YOUR LOGGING LEVEL HERE #
In [5]:
# Other logger stuff for Jupyter notebooks
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M")
handler.setFormatter(formatter)
logger.handlers = [handler]
Set these three things:
ROOT_DIR
PROTEIN_ID
will be createdPROTEIN_ID
PROTEIN_SEQ
A directory will be created in ROOT_DIR
with your PROTEIN_ID
name. The folders are organized like so:
ROOT_DIR
└── PROTEIN_ID
├── sequences # Protein sequence files, alignments, etc.
└── structures # Protein structure files, calculations, etc.
In [6]:
# SET FOLDERS AND DATA HERE
import tempfile
ROOT_DIR = tempfile.gettempdir()
PROTEIN_ID = 'SRR1753782_00918'
PROTEIN_SEQ = 'MSKQQIGVVGMAVMGRNLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE'
In [7]:
# Create the Protein object
my_protein = Protein(ident=PROTEIN_ID, root_dir=ROOT_DIR, pdb_file_type='mmtf')
In [8]:
# Load the protein sequence
# This sets the loaded sequence as the representative one
my_protein.load_manual_sequence(seq=PROTEIN_SEQ, ident='WT', write_fasta_file=True, set_as_representative=True)
Out[8]:
In [9]:
# Mapping using BLAST
my_protein.blast_representative_sequence_to_pdb(seq_ident_cutoff=0.9, evalue=0.00001)
my_protein.df_pdb_blast.head()
Out[9]:
Out[9]:
In [10]:
# Download all mapped PDBs and gather the metadata
my_protein.download_all_pdbs()
my_protein.df_pdb_metadata.head(2)
Out[10]:
Out[10]:
In [11]:
# Set representative structures
my_protein.set_representative_structure()
Out[11]:
In [12]:
my_protein.__dict__
Out[12]:
In [13]:
# Input your mutated sequence and load it
mutated_protein1_id = 'N17P_SNP'
mutated_protein1_seq = 'MSKQQIGVVGMAVMGRPLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE'
my_protein.load_manual_sequence(ident=mutated_protein1_id, seq=mutated_protein1_seq)
Out[13]:
In [14]:
# Input another mutated sequence and load it
mutated_protein2_id = 'Q4S_N17P_SNP'
mutated_protein2_seq = 'MSKSQIGVVGMAVMGRPLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE'
my_protein.load_manual_sequence(ident=mutated_protein2_id, seq=mutated_protein2_seq)
Out[14]:
In [15]:
# Conduct pairwise sequence alignments
my_protein.pairwise_align_sequences_to_representative()
In [20]:
# View IDs of all sequence alignments
[x.id for x in my_protein.sequence_alignments]
# View the stored information for one of the alignments
my_alignment = my_protein.sequence_alignments.get_by_id('WT_Q4S_N17P_SNP')
my_alignment.annotations
str(my_alignment[0].seq)
str(my_alignment[1].seq)
Out[20]:
Out[20]:
Out[20]:
Out[20]:
In [21]:
# Summarize all the mutations in all sequence alignments
s,f = my_protein.sequence_mutation_summary(alignment_type='seqalign')
print('Single mutations:')
s
print('---------------------')
print('Mutation fingerprints')
f
Out[21]:
Out[21]:
In [22]:
import ssbio.databases.uniprot
In [23]:
this_examples_uniprot = 'P14062'
sites = ssbio.databases.uniprot.uniprot_sites(this_examples_uniprot)
my_protein.representative_sequence.features = sites
my_protein.representative_sequence.features
Out[23]:
In [24]:
# Returns a dictionary mapping sequence residue numbers to structure residue identifiers
# Will warn you if residues are not present in the structure
structure_sites = my_protein.map_seqprop_resnums_to_structprop_resnums(resnums=[1,3,45],
use_representatives=True)
structure_sites
Out[24]:
The awesome package nglview is utilized as a backend for viewing structures within a Jupyter notebook. ssbio
view functions will either return a NGLWidget
object, which is the same as using nglview
like the below example, or act upon the widget object itself.
# This is how NGLview usually works - it will load a structure file and return a NGLWidget "view" object.
import nglview
view = nglview.show_structure_file(my_protein.representative_structure.structure_path)
view
In [27]:
# View just the structure
view = my_protein.representative_structure.view_structure(recolor=True)
view
In [28]:
# Map the mutations on the visualization (scale increased) - will show up on the above view
my_protein.add_mutations_to_nglview(view=view, alignment_type='seqalign', scale_range=(4,7),
use_representatives=True)
In [29]:
# Add sites as shown above in the table to the view
my_protein.add_features_to_nglview(view=view, use_representatives=True)
In [30]:
import os.path as op
my_protein.save_json(op.join(my_protein.protein_dir, '{}.json'.format(my_protein.id)))